Using the data collected from existing customers, build a model that will help the marketing team identify potential customers who are relatively more likely to subscribe to a term deposit, and thus increase the campaign's hit ratio.
The historical data for this project is available in file https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
Bank client data:
#Import all the necessary modules
import warnings
warnings.filterwarnings('ignore')
#Load Libraries
#!pip install pandas_profiling
import pandas as pd
import pandas_profiling
import numpy as np
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix,recall_score, precision_score, f1_score, roc_auc_score,accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline
# Remove scientific notations and display numbers with 2 decimal points instead
pd.options.display.float_format = '{:,.2f}'.format
# Default number of bins for histograms drawn later in the notebook.
num_bins = 10
# Load the bank marketing data.
# NOTE(review): the UCI bank-full.csv distribution is semicolon-delimited,
# while read_csv defaults to ',' — confirm the local file's delimiter.
bank_df = pd.read_csv('bank-full.csv')
# First look at the raw rows.
bank_df.head()
# Column dtypes and non-null counts.
bank_df.info()
# Count missing values per column.
bank_df.isna().sum()
#size
bank_df.shape
# Summary statistics for the numeric columns.
bank_df.describe().transpose()
bank_df.columns.values
#Number of unique in each column
bank_df.nunique()
def df_col_analysis(df):
    """Print a per-column summary of *df*.

    For object (string) columns, print the value counts; for all other
    (numeric) columns, print the describe() summary statistics.

    Parameters
    ----------
    df : pandas.DataFrame
    """
    for name in df.columns:
        print("----------")
        print("Col Name: ", name)
        print("Col type: ", df[name].dtype)
        # Compare dtypes with == rather than `is`: identity of dtype objects
        # is a numpy implementation detail and not a guaranteed contract.
        if df[name].dtype == np.dtype('O'):
            print(df[name].value_counts())
        else:
            print(df[name].describe())
# Per-column summary of the raw data.
df_col_analysis(bank_df)
# Automated EDA report via pandas_profiling.
bank_df.profile_report()
# Group column names by type for the plots below.
cont_variables=['age','balance','day','duration','campaign','pdays','previous']
# NOTE(review): 'target' (lowercase) here, but the column is referenced as
# "Target" elsewhere in the notebook — confirm the actual column name.
bool_vars=['default','housing','loan','target']
# NOTE(review): 'categorcial' is a typo, but later cells reference this exact
# name, so it is kept as-is.
categorcial_variables = ['job', 'marital', 'education', 'default', 'loan', 'contact', 'month', 'day', 'poutcome']
# Horizontal bar chart of category frequencies for each categorical column.
for col in categorcial_variables:
    # Compute the counts once instead of twice per iteration.
    counts = bank_df[col].value_counts()
    plt.figure(figsize=(10, 4))
    # Pass x/y as keywords: seaborn >= 0.12 no longer accepts the
    # values/index pair as positional arguments.
    sns.barplot(x=counts.values, y=counts.index)
    plt.title(col)
    plt.tight_layout()
def countplot(label, dataset):
    """Draw a seaborn count plot of column *label* in *dataset*.

    Parameters
    ----------
    label : str
        Column name to count.
    dataset : pandas.DataFrame
        Data to plot from.
    """
    plt.figure(figsize=(15, 10))
    # The Axes return value was previously bound to an unused local; drop it.
    sns.countplot(x=label, data=dataset)
    plt.show()
# Frequency of each job category.
countplot("job", bank_df)
bank_df['education'].value_counts()
# Class balance: proportion of yes/no in the target.
bank_df["Target"].value_counts()/bank_df["Target"].count()
# Another way to plot a histogram of duration is shown below
bank_df['duration'].hist(bins=50)
# NOTE(review): sns.distplot is deprecated in modern seaborn — the current
# equivalents are sns.histplot / sns.displot.
plt.figure(figsize=(10,8))
sns.distplot(bank_df["age"])
plt.figure(figsize=(10,8))
sns.distplot(bank_df["duration"])
plt.show()
plt.figure(figsize=(10,8))
sns.distplot(bank_df["campaign"])
plt.show()
# pdays = -1 encodes "not previously contacted" in this dataset.
bank_df["pdays"].value_counts()
sns.boxplot(data=bank_df, x="Target", y="pdays")
plt.show()
# Compare the pdays distribution of converters vs non-converters.
plt.figure(figsize=(10,8))
sns.distplot(bank_df[bank_df["Target"]=="yes"]["pdays"])
sns.distplot(bank_df[bank_df["Target"]=="no"]["pdays"])
plt.show()
bank_df["previous"].value_counts()
countplot("previous", bank_df)
bank_df["education"].value_counts()
def barGraph(data, col1, col2):
    """Print and bar-plot the counts of *col1*, split by the yes/no column *col2*.

    The 'accepted' column holds counts where col2 == 'yes', and
    'rejected' holds counts where col2 == 'no'.
    """
    accepted_counts = data.loc[data[col2] == 'yes', col1].value_counts()
    rejected_counts = data.loc[data[col2] == 'no', col1].value_counts()
    ms = pd.concat([accepted_counts, rejected_counts], axis=1)
    ms.columns = ['accepted', 'rejected']
    print(ms)
    ms.plot(kind='bar')
# Conversion split by marital status and education.
barGraph(bank_df, 'marital','Target')
barGraph(bank_df, 'education','Target')
countplot("housing", bank_df)
# Correlation heatmap of the numeric columns.
# NOTE(review): on pandas >= 2.0, DataFrame.corr() raises for non-numeric
# columns unless numeric_only=True is passed — confirm the pandas version.
plt.figure(figsize=(10,8))
ax = sns.heatmap(bank_df.corr(),
                 annot=True,
                 linewidths=.5,
                 center=0,
                 cbar=False,
                 cmap="YlGnBu")
plt.title('Correlation')
plt.show()
# Conversion counts per job category.
bank_df.groupby(["job", "Target"])['job'].count()
sns.pairplot(bank_df, hue = 'Target')
sns.boxplot(data=bank_df, x="Target", y="duration")
plt.show()
sns.boxplot(data=bank_df, x="Target", y="campaign")
plt.show()
#People who converted were exposed to fewer campaigns than others
sns.boxplot(data=bank_df, x="Target", y="age")
plt.show()
# Conversion counts per campaign count, education level, and month.
bank_df.groupby(["campaign", "Target"])['campaign'].count()
bank_df.groupby(["education", "Target"])['education'].count()
bank_df.groupby(["month", "Target"])['month'].count()
#Categorize the column of dataset which is object type
for col in bank_df.columns:
    if bank_df[col].dtype == object:
        bank_df[col] = bank_df[col].astype('category')
bank_df.isnull().sum()
bank_df.info()
# values for "Target" : yes/no — encode as 1/0 for modelling.
bank_df["Target_cat"] = bank_df['Target'].map({'yes':1, 'no':0})
# Drop the last two columns for X.
# NOTE(review): this assumes 'Target' is the last original column so that
# the trailing two columns are exactly Target and Target_cat — confirm.
data_x = bank_df.iloc[:, :-2]
print("Shape of X:", data_x.shape)
data_y = bank_df["Target_cat"]
print("Shape of Y:", data_y.shape)
data_y.head()
#month can be category code values
data_x['month'] = data_x['month'].astype("category").cat.codes
# most of data of poutcome is unknown so it is not useful
data_x.drop('poutcome', axis=1, inplace=True)
# Drop 'contact', as every participant has been contacted.
data_x.drop('contact', axis=1, inplace=True)
#Map pdays=-1 into a large value (10000 is used) to indicate
#that it is so far in the past that it has no effect
data_x.loc[data_x['pdays'] == -1, 'pdays'] = 10000
# values for "default" : yes/no
# NOTE(review): .map({'yes':1,'no':0}) yields NaN for any other value, and
# .astype("int") would then raise — assumes these columns are strictly yes/no.
data_x['default_cat'] = data_x['default'].map( {'yes':1, 'no':0} ).astype("int")
data_x.drop('default', axis=1,inplace = True)
# values for "housing" : yes/no
data_x['housing_cat'] = data_x['housing'].map( {'yes':1, 'no':0} ).astype("int")
data_x.drop('housing', axis=1,inplace = True)
# values for "loan" : yes/no
data_x['loan_cat'] = data_x['loan'].map( {'yes':1, 'no':0} ).astype("int")
data_x.drop('loan', axis=1,inplace = True)
# Drop 'day' as they don't have any intrinsic meaning
data_x.drop('day', axis=1, inplace=True)
# Merge similar jobs into broad categories
data_x['job'] = data_x['job'].replace(['management', 'admin.'], 'white-collar')
data_x['job'] = data_x['job'].replace(['services','housemaid'], 'pink-collar')
data_x['job'] = data_x['job'].replace(['retired', 'student', 'unemployed', 'unknown'], 'other')
data_x.info()
# Convert categorical variables to dummies (one-hot encoding).
data_x = pd.get_dummies(data=data_x, columns = ['job', 'marital', 'education' ], \
prefix = ['job', 'marital', 'education'])
data_x.head()
data_x.shape
data_x.info()
# 70/30 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=.3, random_state=22)
X_train.shape, X_test.shape
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the TRAINING data only, then apply that same
# transformation to the test set. Calling fit_transform on X_test would
# re-estimate mean/std from the test data, leaking test information and
# scaling train and test inconsistently.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_train.shape, X_test.shape
def draw_cm(actual, predicted):
    """Render the confusion matrix of *actual* vs *predicted* as a heatmap.

    Axis labels: y = observed class, x = predicted class (0/1).
    """
    matrix = confusion_matrix(actual, predicted)
    class_labels = [0, 1]
    sns.heatmap(matrix, annot=True, fmt='.2f',
                xticklabels=class_labels, yticklabels=class_labels)
    plt.ylabel('Observed')
    plt.xlabel('Predicted')
    plt.show()
# Accumulator for one metrics row per model.
results_df = pd.DataFrame()

def print_metrics(results_df, testName, reg, actual, predicted):
    """Report classification metrics for a fitted estimator and record them.

    Prints train/test accuracy (computed from the module-level
    X_train/X_test/y_train/y_test splits), a confusion-matrix heatmap, and
    recall / precision / F1 / ROC-AUC for *actual* vs *predicted*.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Accumulated results; a new row is appended.
    testName : str
        Label stored in the '1_TestName' column.
    reg : fitted estimator
        Must expose .score(X, y).
    actual, predicted : array-like
        True and predicted labels for the test set.

    Returns
    -------
    pandas.DataFrame
        *results_df* with the new metrics row appended.
    """
    tr_acc = reg.score(X_train, y_train)
    tst_acc = reg.score(X_test, y_test)
    rc_scr = recall_score(actual, predicted)
    pr_scr = precision_score(actual, predicted)
    f1_scr = f1_score(actual, predicted)
    roc_scr = roc_auc_score(actual, predicted)
    print("Training accuracy", tr_acc)
    print()
    print("Testing accuracy", tst_acc)
    print()
    print('Confusion Matrix')
    cm = confusion_matrix(actual, predicted)
    sns.heatmap(cm, annot=True, fmt='.2f', xticklabels=[0, 1], yticklabels=[0, 1])
    plt.ylabel('Observed')
    plt.xlabel('Predicted')
    plt.show()
    print()
    print("Recall:", rc_scr)
    print()
    print("Precision:", pr_scr)
    print()
    print("F1 Score:", f1_scr)
    print()
    print("Roc Auc Score:", roc_scr)
    resultMap = {'1_TestName': testName, 'TrainingAccuracy': tr_acc,
                 'TestingAccuracy': tst_acc, 'RecallScore': rc_scr,
                 'PrecisionScore': pr_scr, 'F1Score': f1_scr,
                 'ROCScore': roc_scr}
    # DataFrame.append was removed in pandas 2.0; concat a one-row frame.
    results_df = pd.concat([results_df, pd.DataFrame([resultMap])],
                           ignore_index=True)
    return results_df
# Baseline model: logistic regression with default hyperparameters.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
results_df= print_metrics(results_df,"LogisticRegression",logreg, y_test,y_pred)
# Compare LogisticRegression solvers at fixed C and L2 penalty.
# Every solver supports 'l2'; only 'liblinear' and 'saga' also support 'l1'.
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
train_score = []
test_score = []
recall_score_list = []
for solver_name in solver:
    clf = LogisticRegression(random_state=42, penalty='l2', C=0.75,
                             solver=solver_name)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    train_score.append(round(clf.score(X_train, y_train), 3))
    test_score.append(round(clf.score(X_test, y_test), 3))
    recall_score_list.append(round(recall_score(y_test, predictions), 3))
print(solver)
print()
print(train_score)
print()
print(test_score)
print()
print(recall_score_list)
from sklearn.tree import DecisionTreeClassifier
#Using default 'gini' criteria to split. Other option include 'entropy'.
dTree = DecisionTreeClassifier(criterion = 'gini', random_state=1)
dTree.fit(X_train, y_train)
y_pred = dTree.predict(X_test)
results_df= print_metrics(results_df,"DecisionTreeClassifier",dTree, y_test,y_pred)
from sklearn.ensemble import BaggingClassifier
# Bag 50 copies of the decision tree, each trained on a bootstrap sample.
# NOTE(review): `base_estimator` was renamed `estimator` in scikit-learn 1.2.
bgcl = BaggingClassifier(base_estimator=dTree, n_estimators=50, random_state=1)
bgcl = bgcl.fit(X_train, y_train)
y_predict = bgcl.predict(X_test)
# Bug fix: evaluate this model's own predictions (y_predict), not the stale
# y_pred left over from the decision tree cell above.
results_df = print_metrics(results_df, "BaggingClassifier", bgcl, y_test, y_predict)
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with 10 weak learners (default base estimator).
abcl = AdaBoostClassifier(n_estimators=10, random_state=1)
abcl = abcl.fit(X_train, y_train)
y_predict = abcl.predict(X_test)
# Bug fix: evaluate this model's own predictions (y_predict), not the stale
# y_pred left over from the decision tree cell above.
results_df = print_metrics(results_df, "AdaBoostClassifier", abcl, y_test, y_predict)
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with 50 boosting stages.
gbcl = GradientBoostingClassifier(n_estimators=50, random_state=1)
gbcl = gbcl.fit(X_train, y_train)
y_predict = gbcl.predict(X_test)
# Bug fix: evaluate this model's own predictions (y_predict), not the stale
# y_pred left over from the decision tree cell above.
results_df = print_metrics(results_df, "GradientBoostingClassifier", gbcl, y_test, y_predict)
from sklearn.ensemble import RandomForestClassifier
# Random forest of 50 trees, considering at most 12 features per split.
rfcl = RandomForestClassifier(n_estimators=50, random_state=1, max_features=12)
rfcl = rfcl.fit(X_train, y_train)
y_predict = rfcl.predict(X_test)
# Bug fix: evaluate this model's own predictions (y_predict), not the stale
# y_pred left over from the decision tree cell above.
results_df = print_metrics(results_df, "RandomForestClassifier", rfcl, y_test, y_predict)
# Transposed summary table: one column per model.
results_df.T
The campaign's goal is to have more people subscribe to a term deposit, i.e. to keep the number of False Negatives low, so that it does not lose customers who want to subscribe to a term deposit.
Types of error:
In this use case, we need to reduce the Type II error so as not to lose the people who would subscribe to term deposits. Hence, Recall is MORE important than precision for our use case.
Based on the above metrics of the various algorithms,